#2 Extensions Build a decision tree to model the likelihood of a sale being of an organic avocado. Use k-means clustering to investigate potential relationships between the year and the average avocado price.
suppressPackageStartupMessages(library(rpart))
library(rpart.plot)
library(dplyr)
# load in the data
avocado_dt <- avocado_tidy
head(avocado_dt)
# we need to shuffle the data, which helps to ensure we are selecting random training and test sets
shuffle_index <- sample(1:nrow(avocado_dt))
avocado_dt <- avocado_dt[shuffle_index, ]
head(avocado_dt)
# with decision trees - it is not necessary to scale the data.
# we still need to remove any NAs (none in this case)
# plus do any variable reduction. We also need to do some
# variable engineering, ensuring all our column are either
# numeric type or factors.
avocado_dt$year <- NULL
avocado_dt$month <- NULL
avocado_dt$week <- NULL
# removed the date columns as the plot has too much information to plot
glimpse(avocado_dt)
Observations: 18,249
Variables: 9
$ averageprice [3m[38;5;246m<dbl>[39m[23m 1.70, 1.19, 0.59, 1.38, 1.05, 1.49, 0.88, 1.11, 1.42, 1.09, 0.…
$ x4046 [3m[38;5;246m<dbl>[39m[23m 425.58, 42405.80, 649646.72, 0.00, 6856.14, 2674.37, 83566.59,…
$ x4225 [3m[38;5;246m<dbl>[39m[23m 0.00, 119272.94, 241271.36, 1367.86, 145975.24, 1500.12, 22350…
$ x4770 [3m[38;5;246m<dbl>[39m[23m 0.00, 1020.51, 231.23, 0.00, 10046.57, 0.00, 18005.27, 108.06,…
$ small.bags [3m[38;5;246m<dbl>[39m[23m 1321.56, 65842.25, 71559.97, 1470.90, 30258.12, 243.33, 346063…
$ large.bags [3m[38;5;246m<dbl>[39m[23m 3.33, 1051.54, 217921.79, 1734.47, 34920.23, 0.00, 1781.28, 17…
$ xlarge.bags [3m[38;5;246m<dbl>[39m[23m 0.00, 57.53, 0.00, 0.00, 1307.21, 0.00, 1452.49, 0.00, 4.52, 0…
$ type [3m[38;5;246m<fct>[39m[23m organic, conventional, conventional, organic, conventional, or…
$ region [3m[38;5;246m<fct>[39m[23m NewOrleansMobile, HarrisburgScranton, MiamiFtLauderdale, Louis…
avocado_dt$region <- NULL
# had to remove region as this also contained too much information to plot
glimpse(avocado_dt)
Observations: 18,249
Variables: 8
$ averageprice [3m[38;5;246m<dbl>[39m[23m 1.70, 1.19, 0.59, 1.38, 1.05, 1.49, 0.88, 1.11, 1.42, 1.09, 0.…
$ x4046 [3m[38;5;246m<dbl>[39m[23m 425.58, 42405.80, 649646.72, 0.00, 6856.14, 2674.37, 83566.59,…
$ x4225 [3m[38;5;246m<dbl>[39m[23m 0.00, 119272.94, 241271.36, 1367.86, 145975.24, 1500.12, 22350…
$ x4770 [3m[38;5;246m<dbl>[39m[23m 0.00, 1020.51, 231.23, 0.00, 10046.57, 0.00, 18005.27, 108.06,…
$ small.bags [3m[38;5;246m<dbl>[39m[23m 1321.56, 65842.25, 71559.97, 1470.90, 30258.12, 243.33, 346063…
$ large.bags [3m[38;5;246m<dbl>[39m[23m 3.33, 1051.54, 217921.79, 1734.47, 34920.23, 0.00, 1781.28, 17…
$ xlarge.bags [3m[38;5;246m<dbl>[39m[23m 0.00, 57.53, 0.00, 0.00, 1307.21, 0.00, 1452.49, 0.00, 4.52, 0…
$ type [3m[38;5;246m<fct>[39m[23m organic, conventional, conventional, organic, conventional, or…
# create training and test sets
create_train_test <- function(data, size = 0.8, train = TRUE) {
n_row = nrow(data)
total_row = size * n_row
train_sample <- 1: total_row
if (train == TRUE) {
return (data[train_sample, ])
} else {
return (data[-train_sample, ])
}
}
avocado_dt_train <- create_train_test(avocado_dt, 0.8, train = TRUE)
avocado_dt_test <- create_train_test(avocado_dt, 0.8, train = FALSE)
# checking that our test and training sets have similar proportions
prop.table(table(avocado_dt_test$type))
conventional organic
0.5054795 0.4945205
prop.table(table(avocado_dt_train$type))
conventional organic
0.4987328 0.5012672
# It is unlikely that we'll get an exact match, but in general the gap
# between the two will be smaller with karger datasets.
avocado_dt_fit <- rpart(type ~ ., data = avocado_dt_train, method = "class")
rpart.plot(avocado_dt_fit)
Not sure why there are organic in the conventional section of the tree and why there are conventional branches in the organis section……
# Use k-means clustering to investigate potential relationships between the year and the average avocado price.
# loading in the dataset
avocado_kmc <- avocado_tidy
glimpse(avocado_kmc)
Observations: 18,249
Variables: 12
$ averageprice [3m[38;5;246m<dbl>[39m[23m 1.33, 1.35, 0.93, 1.08, 1.28, 1.26, 0.99, 0.98, 1.02, 1.07, 1.…
$ x4046 [3m[38;5;246m<dbl>[39m[23m 1036.74, 674.28, 794.70, 1132.00, 941.48, 1184.27, 1368.92, 70…
$ x4225 [3m[38;5;246m<dbl>[39m[23m 54454.85, 44638.81, 109149.67, 71976.41, 43838.39, 48067.99, 7…
$ x4770 [3m[38;5;246m<dbl>[39m[23m 48.16, 58.33, 130.50, 72.58, 75.78, 43.61, 93.26, 80.00, 85.34…
$ small.bags [3m[38;5;246m<dbl>[39m[23m 8603.62, 9408.07, 8042.21, 5677.40, 5986.26, 6556.47, 8196.81,…
$ large.bags [3m[38;5;246m<dbl>[39m[23m 93.25, 97.49, 103.14, 133.76, 197.69, 127.44, 122.05, 562.37, …
$ xlarge.bags [3m[38;5;246m<dbl>[39m[23m 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
$ type [3m[38;5;246m<fct>[39m[23m conventional, conventional, conventional, conventional, conven…
$ year [3m[38;5;246m<dbl>[39m[23m 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 20…
$ region [3m[38;5;246m<fct>[39m[23m Albany, Albany, Albany, Albany, Albany, Albany, Albany, Albany…
$ month [3m[38;5;246m<dbl>[39m[23m 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9…
$ week [3m[38;5;246m<dbl>[39m[23m 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37…
# we are interested in year so I'll remove the week and month columns
avocado_kmc$week <- NULL
avocado_kmc$month <- NULL
glimpse(avocado_kmc)
Observations: 18,249
Variables: 10
$ averageprice [3m[38;5;246m<dbl>[39m[23m 1.33, 1.35, 0.93, 1.08, 1.28, 1.26, 0.99, 0.98, 1.02, 1.07, 1.…
$ x4046 [3m[38;5;246m<dbl>[39m[23m 1036.74, 674.28, 794.70, 1132.00, 941.48, 1184.27, 1368.92, 70…
$ x4225 [3m[38;5;246m<dbl>[39m[23m 54454.85, 44638.81, 109149.67, 71976.41, 43838.39, 48067.99, 7…
$ x4770 [3m[38;5;246m<dbl>[39m[23m 48.16, 58.33, 130.50, 72.58, 75.78, 43.61, 93.26, 80.00, 85.34…
$ small.bags [3m[38;5;246m<dbl>[39m[23m 8603.62, 9408.07, 8042.21, 5677.40, 5986.26, 6556.47, 8196.81,…
$ large.bags [3m[38;5;246m<dbl>[39m[23m 93.25, 97.49, 103.14, 133.76, 197.69, 127.44, 122.05, 562.37, …
$ xlarge.bags [3m[38;5;246m<dbl>[39m[23m 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
$ type [3m[38;5;246m<fct>[39m[23m conventional, conventional, conventional, conventional, conven…
$ year [3m[38;5;246m<dbl>[39m[23m 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 20…
$ region [3m[38;5;246m<fct>[39m[23m Albany, Albany, Albany, Albany, Albany, Albany, Albany, Albany…
# as the distance calculations require numerical imputs, we need to remove the factor variables
# from the dataset.
avocado_kmc$type <- NULL
avocado_kmc$region <- NULL
glimpse(avocado_kmc)
Observations: 18,249
Variables: 8
$ averageprice [3m[38;5;246m<dbl>[39m[23m 1.33, 1.35, 0.93, 1.08, 1.28, 1.26, 0.99, 0.98, 1.02, 1.07, 1.…
$ x4046 [3m[38;5;246m<dbl>[39m[23m 1036.74, 674.28, 794.70, 1132.00, 941.48, 1184.27, 1368.92, 70…
$ x4225 [3m[38;5;246m<dbl>[39m[23m 54454.85, 44638.81, 109149.67, 71976.41, 43838.39, 48067.99, 7…
$ x4770 [3m[38;5;246m<dbl>[39m[23m 48.16, 58.33, 130.50, 72.58, 75.78, 43.61, 93.26, 80.00, 85.34…
$ small.bags [3m[38;5;246m<dbl>[39m[23m 8603.62, 9408.07, 8042.21, 5677.40, 5986.26, 6556.47, 8196.81,…
$ large.bags [3m[38;5;246m<dbl>[39m[23m 93.25, 97.49, 103.14, 133.76, 197.69, 127.44, 122.05, 562.37, …
$ xlarge.bags [3m[38;5;246m<dbl>[39m[23m 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
$ year [3m[38;5;246m<dbl>[39m[23m 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 20…
# after selecting variables, we need to consider scaling. Good practive for k-means is to scale
# our data such that the mean is zero and the standard deviation is one. We can check that is the case using summary
summary(avocado_kmc)
averageprice x4046 x4225 x4770
Min. :0.440 Min. : 0 Min. : 0 Min. : 0
1st Qu.:1.100 1st Qu.: 854 1st Qu.: 3009 1st Qu.: 0
Median :1.370 Median : 8645 Median : 29061 Median : 185
Mean :1.406 Mean : 293008 Mean : 295155 Mean : 22840
3rd Qu.:1.660 3rd Qu.: 111020 3rd Qu.: 150207 3rd Qu.: 6243
Max. :3.250 Max. :22743616 Max. :20470573 Max. :2546439
small.bags large.bags xlarge.bags year
Min. : 0 Min. : 0 Min. : 0.0 Min. :2015
1st Qu.: 2849 1st Qu.: 127 1st Qu.: 0.0 1st Qu.:2015
Median : 26363 Median : 2648 Median : 0.0 Median :2016
Mean : 182195 Mean : 54338 Mean : 3106.4 Mean :2016
3rd Qu.: 83338 3rd Qu.: 22029 3rd Qu.: 132.5 3rd Qu.:2017
Max. :13384587 Max. :5719097 Max. :551693.7 Max. :2018
# some large numbers - so we need to scale
avocado_kmc <- avocado_kmc %>%
scale()
summary(avocado_kmc)
averageprice x4046 x4225 x4770
Min. :-2.39889 Min. :-0.2316 Min. :-0.2451 Min. :-0.2125
1st Qu.:-0.75986 1st Qu.:-0.2310 1st Qu.:-0.2426 1st Qu.:-0.2125
Median :-0.08935 Median :-0.2248 Median :-0.2210 Median :-0.2108
Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.63083 3rd Qu.:-0.1439 3rd Qu.:-0.1204 3rd Qu.:-0.1544
Max. : 4.57941 Max. :17.7477 Max. :16.7553 Max. :23.4832
small.bags large.bags xlarge.bags year
Min. :-0.2442 Min. :-0.2227 Min. :-0.1756 Min. :-1.2212
1st Qu.:-0.2404 1st Qu.:-0.2222 1st Qu.:-0.1756 1st Qu.:-1.2212
Median :-0.2088 Median :-0.2119 Median :-0.1756 Median :-0.1573
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.1325 3rd Qu.:-0.1324 3rd Qu.:-0.1681 3rd Qu.: 0.9066
Max. :17.6933 Max. :23.2195 Max. :31.0061 Max. : 1.9704
clustered_avocado_kmc <- kmeans(avocado_kmc, 4)
clustered_avocado_kmc
K-means clustering with 4 clusters of sizes 6403, 6080, 169, 5597
Cluster means:
averageprice x4046 x4225 x4770 small.bags large.bags xlarge.bags
1 -0.9314643 0.1058285 0.07490033 0.1126751 0.07229344 0.07231949 0.05436323
2 0.4018306 -0.1651753 -0.14379337 -0.1669315 -0.12842205 -0.11894501 -0.09535043
3 -0.7796992 9.2632996 9.40080933 8.3364071 9.13719634 8.23495701 6.99855633
4 0.6526362 -0.2213420 -0.21333927 -0.1992796 -0.21909506 -0.20217685 -0.16993267
year
1 -4.325041e-01
2 1.122130e+00
3 3.242669e-05
4 -7.241793e-01
Clustering vector:
[1] 4 4 1 1 4 4 1 1 1 1 1 4 4 1 4 4 1 1 4 4 1 4 1 4 1 4 4 4 4 1 1 1 4 4 1 1 1 1 1 1
[41] 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[81] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[121] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[161] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[201] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1
[241] 1 1 1 4 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 1 4 4 4 4 1 4 4 4 4 4 4 4 4 4 4 4
[281] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 1 1
[321] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[361] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[401] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[441] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[481] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[521] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[561] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[601] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[641] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[681] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[721] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 4 4
[761] 1 4 1 1 1 1 1 1 1 4 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[801] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[841] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[881] 1 1 4 1 4 1 1 1 4 4 1 1 1 1 1 4 4 4 4 1 4 4 1 1 1 1 4 4 1 1 4 1 1 4 4 4 1 1 1 4
[921] 4 4 4 1 1 4 4 1 4 4 1 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[961] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[ reached getOption("max.print") -- omitted 17249 entries ]
Within cluster sum of squares by cluster:
[1] 20289.948 8668.976 14428.181 4366.963
(between_SS / total_SS = 67.3 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
set.seed(2345)
library(animation)
index <- c(1,8)
kmeans.ani(avocado_kmc[,index], 4)